Average Household Size

In [161]:
m = folium.Map(location=[34.4, -81.1], zoom_start=10.5)

folium.Choropleth(
    geo_data=geojson,
    name='Household Size',
    data=cbg_data,
    columns=['census_block_group', 'household_size'], # , # ['State', 'Unemployment'],
    key_on='feature.properties.CensusBlockGroup',
    fill_color='Reds',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Average Household Size (# of people)'
).add_to(m)

folium.LayerControl().add_to(m)

m
Out[161]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Median Household Income

In [149]:
m = folium.Map(location=[34.4, -81.1], zoom_start=10.5)

folium.Choropleth(
    geo_data=geojson,
    name='Median Household Income',
    data=cbg_data,
    columns=['census_block_group', 'B19013e1'], # B01001e1, # ['State', 'Unemployment'],
    key_on='feature.properties.CensusBlockGroup',
    fill_color='Greens',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Median Household Income ($)'
).add_to(m)

m
Out[149]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Population

In [122]:
m = folium.Map(location=[34.4, -81.1], zoom_start=10.5)

folium.Choropleth(
    geo_data=geojson,
    name='Population',
    data=cbg_data,
    columns=['census_block_group', 'B01001e1'], # , # ['State', 'Unemployment'],
    key_on='feature.properties.CensusBlockGroup',
    fill_color='Blues',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Population (# of people)'
).add_to(m)

m
Out[122]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [73]:
import os
from pathlib import Path
import json
import linecache
import functools

import pandas as pd
import geopandas as gpd 
import matplotlib.pyplot as plt
import folium 
from shapely.ops import nearest_points
from shapely.geometry import LineString
In [3]:
DATA_PATH = Path('/data/safegraph/safegraph_open_census_data')
PREPROCESSED_DATA_PATH = Path('../../../data/preprocessed/safegraph/safegraph_open_census_data')
In [100]:
# Only needs to be run once
county_fips_code = '45039'  # Fairfield County, South Carolina

!mkdir -p {PREPROCESSED_DATA_PATH}
census_data_file_names = !ls {DATA_PATH}/data/ | grep [0-9] #  | cut -f 1 -d .  # eliminate .csv suffix
county_directory = PREPROCESSED_DATA_PATH / "data/county" / county_fips_code
!mkdir -p {county_directory}
for file_name in census_data_file_names:
    !touch {county_directory}/{file_name}
    print(county_directory/file_name)
    !head -n 1 "{DATA_PATH}/data/{file_name}" > {county_directory}/{file_name}
    !cat "{DATA_PATH}/data/{file_name}" | grep ^{county_fips_code}.*$ >> {county_directory}/{file_name}
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b00.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b01.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b02.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b03.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b07.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b08.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b09.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b11.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b12.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b14.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b15.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b19.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b20.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b21.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b22.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b23.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b25.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b27.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b99.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_c16.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_c17.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_c24.csv
In [155]:
# See: https://www.safegraph.com/blog/beginners-guide-to-census
    
table_ids = [
    'B01001e1',   # SEX BY AGE: Total: Total population -- (Estimate),Sex By Age, Total, Total Population -- (Estimate),,,,,
    #'B00001e1',   # UNWEIGHTED SAMPLE COUNT OF THE POPULATION: Total: Total population -- (Estimate),Unweighted Sample Count Of The Population, Total, Total Population -- (Estimate),,,,,
    #'B00001m1',   # UNWEIGHTED SAMPLE COUNT OF THE POPULATION: Total: Total population -- (Margin of Error),Unweighted Sample Count Of The Population, Total, Total Population -- (Margin Of Error),,,,,
    'B19013e1',   # Median Household Income
    #'B00002e1',   # UNWEIGHTED SAMPLE HOUSING UNITS: Total: Housing units -- (Estimate),Unweighted Sample Housing Units, Total, Housing Units -- (Estimate),,,,,
    #'B00002m1',   # UNWEIGHTED SAMPLE HOUSING UNITS: Total: Housing units -- (Margin of Error),Unweighted Sample Housing Units, Total, Housing Units -- (Margin Of Error),,,,,
    'B25001e1',   # HOUSING UNITS: Total: Housing units -- (Estimate),Housing Units, Total, Housing Units -- (Estimate),,,,,
    #'B25001m1',   # HOUSING UNITS: Total: Housing units -- (Margin of Error),Housing Units, Total, Housing Units -- (Margin Of Error),,,,,

    'C17002e1',   #RATIO OF INCOME TO POVERTY LEVEL IN THE PAST 12 MONTHS: Total: Population for whom poverty status is determined -- (Estimate),Ratio Of Income To Poverty Level In The Past 12 Months, Total, Population For Whom Poverty Status Is Determined -- (Estimate),,,,,
    
    # unavailable:
    # 'B17017e10',   # POVERTY STATUS IN THE PAST 12 MONTHS BY HOUSEHOLD TYPE BY AGE OF HOUSEHOLDER: Total: Households -- (Estimate),Poverty Status In The Past 12 Months By Household Type By Age Of Householder, Total, Households -- (Estimate),,,,,
]

cbg_field_desc = pd.read_csv(DATA_PATH / 'metadata/cbg_field_descriptions.csv')
cbg_field_desc[cbg_field_desc.table_id.isin(table_ids)]
Out[155]:
table_id field_full_name field_level_1 field_level_2 field_level_3 field_level_4 field_level_5 field_level_6 field_level_7 field_level_8
4 B01001e1 SEX BY AGE: Total: Total population -- (Estimate) Sex By Age Total Total Population -- (Estimate) NaN NaN NaN NaN NaN
3094 B19013e1 MEDIAN HOUSEHOLD INCOME IN THE PAST 12 MONTHS ... Median Household Income In The Past 12 Months ... Total Households -- (Estimate) NaN NaN NaN NaN NaN
4394 B25001e1 HOUSING UNITS: Total: Housing units -- (Estimate) Housing Units Total Housing Units -- (Estimate) NaN NaN NaN NaN NaN
7016 C17002e1 RATIO OF INCOME TO POVERTY LEVEL IN THE PAST 1... Ratio Of Income To Poverty Level In The Past 1... Total Population For Whom Poverty Status Is Determi... NaN NaN NaN NaN NaN
In [157]:
county_files = ['cbg_b00.csv', 'cbg_b01.csv', 'cbg_b19.csv', 'cbg_b25.csv', 'cbg_c17.csv'] #  !ls {county_directory}

dfs = [pd.read_csv(county_directory / file, dtype={'census_block_group': str}) for file in county_files]
merged = dfs[0]
for df in dfs[1:]:
    merged = pd.merge(merged, df, on=['census_block_group'])
cbg_data = merged
    
#combiner = lambda left, right: pd.merge(left, right, on=['census_block_group'])
#cbg_data = functools.reduce(combiner, dfs[0], dfs[1:])


'''cbg_b19 = pd.read_csv(county_directory / 'cbg_b19.csv', dtype={'census_block_group': str})
cbg_b01 = pd.read_csv(county_directory / 'cbg_b01.csv', dtype={'census_block_group': str})
cbg_data = pd.merge(cbg_b01, cbg_b19, on=['census_block_group'])'''
cbg_data = cbg_data[['census_block_group'] + table_ids]
#cbg_data.dropna().head()
cbg_data
Out[157]:
census_block_group B01001e1 B19013e1 B25001e1 C17002e1
0 450399601001 1101 29250.0 624 1101
1 450399601002 931 39931.0 523 931
2 450399602001 1015 46898.0 517 1015
3 450399602002 1260 30481.0 782 1260
4 450399602003 1467 48125.0 642 1409
5 450399603001 1046 68902.0 1079 1046
6 450399603002 890 NaN 449 890
7 450399603003 1482 30625.0 678 1387
8 450399603004 2257 31346.0 982 2132
9 450399604001 962 26250.0 451 933
10 450399604002 822 NaN 462 822
11 450399604003 660 70050.0 465 660
12 450399604004 815 21024.0 550 815
13 450399604005 2115 20601.0 938 2070
14 450399605001 2015 57375.0 609 2015
15 450399605002 1236 48722.0 538 1236
16 450399605003 1596 33125.0 684 1596
17 450399605004 1355 31306.0 689 1355
In [158]:
sum(cbg_data['B01001e1'])  # total population
Out[158]:
23025
In [159]:
cbg_data['household_size'] = cbg_data['B01001e1'] / cbg_data['B25001e1']
<ipython-input-159-a1e05c5d3218>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbg_data['household_size'] = cbg_data['B01001e1'] / cbg_data['B25001e1']
In [160]:
cbg_data
Out[160]:
census_block_group B01001e1 B19013e1 B25001e1 C17002e1 household_size
0 450399601001 1101 29250.0 624 1101 1.764423
1 450399601002 931 39931.0 523 931 1.780115
2 450399602001 1015 46898.0 517 1015 1.963250
3 450399602002 1260 30481.0 782 1260 1.611253
4 450399602003 1467 48125.0 642 1409 2.285047
5 450399603001 1046 68902.0 1079 1046 0.969416
6 450399603002 890 NaN 449 890 1.982183
7 450399603003 1482 30625.0 678 1387 2.185841
8 450399603004 2257 31346.0 982 2132 2.298371
9 450399604001 962 26250.0 451 933 2.133038
10 450399604002 822 NaN 462 822 1.779221
11 450399604003 660 70050.0 465 660 1.419355
12 450399604004 815 21024.0 550 815 1.481818
13 450399604005 2115 20601.0 938 2070 2.254797
14 450399605001 2015 57375.0 609 2015 3.308703
15 450399605002 1236 48722.0 538 1236 2.297398
16 450399605003 1596 33125.0 684 1596 2.333333
17 450399605004 1355 31306.0 689 1355 1.966618
In [136]:
# Census Block Groups
In [16]:
def geojson_for_county(state_abbreviation="SC",
                       county_name="Fairfield County",
                       geojson_path=DATA_PATH / "geometry/cbg.geojson"):
    header = !head -n 5 {geojson_path}
    footer = !tail -n 2 {geojson_path}
    
    # lines to search file for county of interest.
    # must be found by inspection using "tail | head" method below, and checking whether 
    # the state of interest is included.
    # If not included, search up or down via binary search (file is sorted by state)
    # TODO: write the binary search explicitly here, if we need to generalize to other states/counties
    line_start_search = 170000
    line_end_search = 180000
    num_lines = line_end_search - line_start_search
    
    # stream = os.popen(f"""< {geojson_path} tail -n +{line_start_search} | head -n {num_lines} | grep '"State": "{state_abbreviation}", "County": "{county_name}"'  """)
    #stream = os.popen(f"""cat {geojson_path} | tail -n +{line_start_search} | head -n {num_lines} | grep '"State": "{state_abbreviation}", "County": "{county_name}"'  """)
    #county_cbgs = stream.readlines()
    
    county_cbgs = [linecache.getline(str(geojson_path), line_number).strip() for line_number in range(line_start_search, line_end_search)]
    county_cbgs = [line for line in county_cbgs if f'"State": "{state_abbreviation}", "County": "{county_name}"' in line]
    
    print(len(county_cbgs))
    
    # remove final character from last entry in list:
    # a trailing "," that will mess up the JSON parsing
    if county_cbgs[-1][-1] == ',':
        county_cbgs[-1] = county_cbgs[-1][:-1]
    
    return json.loads('\n'.join(header + county_cbgs + footer))
    
In [17]:
# only needs to be run once
geojson = geojson_for_county()
18
In [20]:
#len([f['properties']['CensusBlockGroup'] for f in geojson['features']])
#[f['properties']['CensusBlockGroup'] for f in cbgs_json['features']]
Out[20]:
18
In [24]:
#geojson['features'][0]['properties']
Out[24]:
{'StateFIPS': '45',
 'CountyFIPS': '039',
 'TractCode': '960300',
 'BlockGroup': '2',
 'CensusBlockGroup': '450399603002',
 'State': 'SC',
 'County': 'Fairfield County',
 'ClassCode': 'H1'}
In [ ]: